library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(RColorBrewer)
library(nloptr)
library(reshape2)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(bnlearn)
## 
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
## 
##     sigma
library(stringr)
df.tcr = read.table("../result/backbone.txt", header = T, sep = "\t") %>%
  mutate(tcr_chain = as.factor(substr(as.character(tcr_v_allele), 1, 3)))

df.ag = read.table("../result/backbone_ag.txt", header = T, sep = "\t")

df.master = read.table("../result/structure.txt", header=T, sep="\t") %>%
  filter(tcr_region %in% c("CDR1", "CDR2", "CDR3")) %>%
  mutate(tcr_chain = as.factor(substr(as.character(tcr_v_allele), 1, 3))) %>%
  droplevels

Backbone

Overview

Modelling \(x_i = f(l, L)\) for \(C\alpha\) atoms. CDRs and antigen structures are transformed as follows:

  • C terminal residue is used as origin
  • Coordinates are rotated so that C-N terminus direction is aligned with X axis
  • YZ projection of the center of mass is aligned with Z axis

First lets have a look at these functions for TCR

ggplot(df.tcr, aes(x = pos_tcr / (len_tcr - 1), y = x)) +
  geom_line(aes(group = pdb_id, color = len_tcr), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(tcr_region~tcr_chain)

ggplot(df.tcr, aes(x = pos_tcr / (len_tcr - 1), y = y)) +
  geom_line(aes(group = pdb_id, color = len_tcr), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(tcr_region~tcr_chain)

ggplot(df.tcr, aes(x = pos_tcr / (len_tcr - 1), y = z)) +
  geom_line(aes(group = pdb_id, color = len_tcr), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(tcr_region~tcr_chain)

R direction

ggplot(df.tcr, aes(x = pos_tcr, y = z_cb - z)) +
  geom_line(aes(group = interaction(pdb_id, tcr_chain)), alpha = 0.5) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_wrap(~len_tcr)

and antigen

ggplot(df.ag, aes(x = pos_ag / (len_ag - 1), y = x)) +
  geom_line(aes(group = pdb_id, color = len_ag), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(.~mhc_type)

ggplot(df.ag, aes(x = pos_ag / (len_ag - 1), y = y)) +
  geom_line(aes(group = pdb_id, color = len_ag), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(.~mhc_type)

ggplot(df.ag, aes(x = pos_ag / (len_ag - 1), y = z)) +
  geom_line(aes(group = pdb_id, color = len_ag), alpha = 0.8) +
  scale_color_gradientn(colors=colorRampPalette(rev(brewer.pal(11, 'Spectral')))(32)) +
  facet_grid(.~mhc_type)

R direction

ggplot(df.ag, aes(x = pos_ag, y = z_cb - z)) +
  geom_line(aes(group = pdb_id), alpha = 0.5) +
  facet_wrap(~len_ag)

Fitting XYZ with Beta and linear functions

Functions to fit

calc_x = function(l, L, params) {
  H = params[1] + params[2] * L
  a = params[3]
  b = params[4]
  d = params[5] + params[6] * L # distance from C to F/W
  x = l / (L - 1)
  return(x * d + H * x ^ (a - 1) * (1 - x) ^ (b - 1) / beta(a, b))
}

param_ranges_x = data.frame(axis = "x",
                            x0 = c(10, 0.5, 1.5, 1.5, 5, 0.5),
                            lb = c(0, 0, 1.001, 1.001, 0, 0),
                            ub = c(20, 5, 10, 10, 50, 5))

calc_y = function(l, L, params) {
  H = params[1] + params[2] * L
  a1 = params[3]
  b1 = params[4]
  a2 = params[5]
  b2 = params[6]
  x = l / (L - 1)
  return(H * (x ^ (a2 - 1) * (1 - x) ^ (b2 - 1) / beta(a2, b2) - x ^ (a1 - 1) * (1 - x) ^ (b1 - 1) / beta(a1, b1)))
}

param_ranges_y = data.frame(axis = "y",
                            x0 = c(10, 0.5, 2, 5, 5, 2),
                            lb = c(0, 0, 1.001, 1.001, 1.001, 1.001),
                            ub = c(20, 2, 10, 10, 10, 10))

calc_z = function(l, L, params) {
  H = params[1] + params[2] * L
  a = params[3]
  b = params[4]
  x = l / (L - 1)
  return(H * x ^ (a - 1) * (1 - x) ^ (b - 1) / beta(a, b))
}

param_ranges_z = data.frame(axis = "z",
                            x0 = c(10, 0.5, 1.5, 1.5),
                            lb = c(0, 0, 1.001, 1.001),
                            ub = c(20, 2, 10, 10))

params_cdr = rbind(param_ranges_x, param_ranges_y, param_ranges_z)


calc_coord = function(l, L, params, axis) {
   switch(axis,
        x = calc_x(l, L, params),
        y = calc_y(l, L, params),
        z = calc_z(l, L, params))
}

Fitting CDR loops

Optimization goes here

params_fitted_tcr = data.frame()

for (t in list(c("CDR1", "CDR2"), "CDR3")) {
  for (a in c("x", "y", "z")) {
    print(paste("Fitting", paste(t), a))
    
    # dplyr::select starting params and bounds
    params_nloptr = params_cdr %>% filter(axis == a)
    
    # prepare data
    df.fit = df.tcr %>% filter(tcr_region %in% t)
    df.fit$value = df.fit[,a]
    df.fit$pos = df.fit$pos_tcr
    df.fit$len = df.fit$len_tcr
    
    # generic objective
    obj = function(params, data) with(data, mean((value - calc_coord(pos, len, params, a))^2))
    
    res = with(params_nloptr,
               nloptr(x0 = x0, lb = lb, ub = ub,
                      eval_f = obj,
                      data = df.fit,
                      opts = list(algorithm = "NLOPT_LN_SBPLX", maxeval = 1e5)))
    
    for (tt in t) {
      solution = data.frame(tcr_region = tt,
                            axis = a,
                            params = res$solution)
      params_fitted_tcr = rbind(params_fitted_tcr, solution)
    }
    
    print(res)
    }
}
## [1] "Fitting CDR1 x" "Fitting CDR2 x"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 3033 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  3.06112324554689 
## Optimal value of controls: 0.3111275 0 10 3.138592 4.203514 0.8356583
## 
## 
## [1] "Fitting CDR1 y" "Fitting CDR2 y"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 6524 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.735020819436763 
## Optimal value of controls: 0 2 3.995074 3.347118 4.072782 3.340119
## 
## 
## [1] "Fitting CDR1 z" "Fitting CDR2 z"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 8483 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.862567735062751 
## Optimal value of controls: 0 0.5090817 2.073567 2.124372
## 
## 
## [1] "Fitting CDR3 x"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 5017 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  4.20342129647816 
## Optimal value of controls: 1.169303 0.1893118 2.172392 1.839203 5.88275 0
## 
## 
## [1] "Fitting CDR3 y"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 40185 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  2.0577584357609 
## Optimal value of controls: 7.398926 0.2933037 3.519635 2.159706 4.353659 2.455754
## 
## 
## [1] "Fitting CDR3 z"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 941 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  1.30010418614403 
## Optimal value of controls: 1.560094 0.5181191 2.392882 2.431665

Check solutions

calc_coord_tcr_fitted = function(l, L, tcr_region, axis) {
  t = as.character(tcr_region[1])
  a = as.character(axis[1])
  params = (params_fitted_tcr %>% filter(tcr_region == t, axis == a))$params
  
  calc_coord(l, L, params, a)
}

df.tcr.melt = melt(df.tcr, measure.vars = c("x", "y", "z"))

df.tcr.melt = df.tcr.melt %>%
  group_by(tcr_region, variable) %>%
  mutate(value_fit = calc_coord_tcr_fitted(pos_tcr, len_tcr, tcr_region, variable))
ggplot(df.tcr.melt %>% filter(variable == "x"), aes(x = pos_tcr / (len_tcr - 1))) +
  geom_line(aes(group = interaction(pdb_id, tcr_chain, tcr_region), y = value), alpha = 0.8) +
  geom_line(aes(group = tcr_region, y = value_fit), color = "red") +
  ylab("x") +
  facet_wrap(~len_tcr)

ggplot(df.tcr.melt %>% filter(variable == "y"), aes(x = pos_tcr / (len_tcr - 1))) +
  geom_line(aes(group = interaction(pdb_id, tcr_chain, tcr_region), y = value), alpha = 0.8) +
  geom_line(aes(group = tcr_region, y = value_fit), color = "red") +
  ylab("y") +
  facet_wrap(~len_tcr)

ggplot(df.tcr.melt %>% filter(variable == "z"), aes(x = pos_tcr / (len_tcr - 1))) +
  geom_line(aes(group = interaction(pdb_id, tcr_chain, tcr_region), y = value), alpha = 0.8) +
  geom_line(aes(group = tcr_region, y = value_fit), color = "red") +
  ylab("z") +
  facet_wrap(~len_tcr)

Fitting antigen peptide

Optimization goes here

params_fitted_ag = data.frame()

for (m in c("MHCI", "MHCII")) {
  for (a in c("x", "y", "z")) {
    print(paste("Fitting", m, a))
    
    # dplyr::select starting params and bounds
    params_nloptr = params_cdr %>% filter(axis == a)
    
    # prepare data
    df.fit = df.ag %>% filter(mhc_type %in% m)
    df.fit$value = df.fit[,a]
    df.fit$pos = df.fit$pos_ag
    df.fit$len = df.fit$len_ag
    
    # generic objective
    obj = function(params, data) with(data, mean((value - calc_coord(pos, len, params, a))^2))
    
    res = with(params_nloptr,
               nloptr(x0 = x0, lb = lb, ub = ub,
                      eval_f = obj,
                      data = df.fit,
                      opts = list(algorithm = "NLOPT_LN_SBPLX", maxeval = 1e5)))
    
    solution = data.frame(mhc_type = m,
                          axis = a,
                          params = res$solution)
    params_fitted_ag = rbind(params_fitted_ag, solution)
    
    print(res)
    }
}
## [1] "Fitting MHCI x"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1012 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  1.36767361381065 
## Optimal value of controls: 0 0.03043346 2.107937 10 16.19432 0.6542352
## 
## 
## [1] "Fitting MHCI y"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 2567 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.667542606283011 
## Optimal value of controls: 0.4680786 0 1.95011 1.001 10 1.435702
## 
## 
## [1] "Fitting MHCI z"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 4407 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  1.66946869482074 
## Optimal value of controls: 0 0.3534113 2.992723 2.545585
## 
## 
## [1] "Fitting MHCII x"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 2884 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.764408744994204 
## Optimal value of controls: 0 0.03467443 4.449276 2.066645 1.613794 2.626012
## 
## 
## [1] "Fitting MHCII y"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 3169 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  1.3347037919981 
## Optimal value of controls: 0 0.009004148 3.204594 10 1.22578 1.064003
## 
## 
## [1] "Fitting MHCII z"
## 
## Call:
## 
## nloptr(x0 = x0, eval_f = obj, lb = lb, ub = ub, opts = list(algorithm = "NLOPT_LN_SBPLX", 
##     maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 2672 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  2.73296867021219 
## Optimal value of controls: 0 0.227257 1.843548 1.766022

Check solutions

calc_coord_ag_fitted = function(l, L, mhc_type, axis) {
  m = as.character(mhc_type[1])
  a = as.character(axis[1])
  params = (params_fitted_ag %>% filter(mhc_type == m, axis == a))$params
  
  calc_coord(l, L, params, a)
}

df.ag.melt = melt(df.ag, measure.vars = c("x", "y", "z"))

df.ag.melt = df.ag.melt %>%
  group_by(mhc_type, variable) %>%
  mutate(value_fit = calc_coord_ag_fitted(pos_ag, len_ag, mhc_type, variable))
ggplot(df.ag.melt %>% filter(variable == "x"), aes(x = pos_ag / (len_ag - 1))) +
  geom_line(aes(group = pdb_id, y = value), alpha = 0.8) +
  geom_line(aes(group = mhc_type, y = value_fit), color = "red") +
  ylab("x") +
  facet_wrap(~len_ag)

ggplot(df.ag.melt %>% filter(variable == "y"), aes(x = pos_ag / (len_ag - 1))) +
  geom_line(aes(group = pdb_id, y = value), alpha = 0.8) +
  geom_line(aes(group = mhc_type, y = value_fit), color = "red") +
  ylab("y") +
  facet_wrap(~len_ag)

ggplot(df.ag.melt %>% filter(variable == "z"), aes(x = pos_ag / (len_ag - 1))) +
  geom_line(aes(group = pdb_id, y = value), alpha = 0.8) +
  geom_line(aes(group = mhc_type, y = value_fit), color = "red") +
  ylab("z") +
  facet_wrap(~len_ag)

Optimizing orientation

Parameters of distance function specify separation across Z axis and yaw and pitch rotation

calc_dist = function(x_tcr, y_tcr, z_tcr, x_ag, y_ag, z_ag, x_tcr_max, x_ag_max, params) {
  z_tcr = -z_tcr + params[1]
  x_tcr = x_tcr - x_tcr_max / 2
  x_ag = x_ag - x_ag_max / 2
  
  cs1 = cos(params[2])
  ss1 = sin(params[2])
  cs2 = cos(params[3])
  ss2 = sin(params[3])
  
  x_tcr = x_tcr * cs1 - y_tcr * ss1
  y_tcr = x_tcr * ss1 + y_tcr * cs1
  x_tcr = x_tcr * cs2 - z_tcr * ss2
  z_tcr = x_tcr * ss2 + z_tcr * cs2
  
  return(sqrt((x_tcr - x_ag) ^ 2 + (y_tcr - y_ag) ^ 2 + (z_tcr - z_ag) ^ 2))
}

Optimization goes here

params_fitted_orientation = data.frame()

df.distest.summary = df.master %>%
  group_by(pdb_id, tcr_chain, tcr_region, mhc_type) %>%
  summarize(bad_entity = min(distance_CA) >= 30)

df.distest = merge(df.master, df.distest.summary) %>% filter(!bad_entity)

for (t in c("CDR1", "CDR2", "CDR3")) {
  for (ch in c("TRA", "TRB")) {
    for (m in c("MHCI", "MHCII")) {
      print(paste("Fitting orientation for", t, ch, m))
      
      df.fit = df.distest %>%
        filter(tcr_region == t, tcr_chain == ch, mhc_type == m) %>%
        dplyr::select(pdb_id, pos_tcr, len_tcr, pos_antigen, len_antigen, distance_CA)
      
      df.fit$x_tcr = calc_coord_tcr_fitted(df.fit$pos_tcr, df.fit$len_tcr, t, "x")
      df.fit$y_tcr = calc_coord_tcr_fitted(df.fit$pos_tcr, df.fit$len_tcr, t, "y")
      df.fit$z_tcr = calc_coord_tcr_fitted(df.fit$pos_tcr, df.fit$len_tcr, t, "z")
      
      df.fit$x_ag = calc_coord_ag_fitted(df.fit$pos_antigen, df.fit$len_antigen, m, "x")
      df.fit$y_ag = calc_coord_ag_fitted(df.fit$pos_antigen, df.fit$len_antigen, m, "y")
      df.fit$z_ag = calc_coord_ag_fitted(df.fit$pos_antigen, df.fit$len_antigen, m, "z")
      
      df.fit = df.fit %>%
        group_by(pdb_id) %>%
        mutate(x_tcr_max = max(x_tcr), x_ag_max = max(x_ag))
      
      obj = function(params, data) with(data, mean((1/distance_CA - 
                                                      1/calc_dist(x_tcr, y_tcr, z_tcr,
                                                                x_ag, y_ag, z_ag, 
                                                                x_tcr_max, x_ag_max,
                                                                params))^2))
      
      res = with(params_nloptr,
               nloptr(x0 = c(20, pi/2, 0), lb = c(0, 0, -pi/2), ub = c(100, 2*pi, pi/2),
                      eval_f = obj,
                      data = df.fit,
                      opts = list(algorithm = "NLOPT_LN_SBPLX", maxeval = 1e5)))
      
      sola = res$solution
      sola[2] = sola[2] / pi * 180
      sola[3] = sola[3] / pi * 180
    
      solution = data.frame(mhc_type = m,
                            tcr_chain = ch,
                            tcr_region = t,
                            params = res$solution,
                            params2 = sola,
                            param_name = c("height", "yaw", "pitch"))
      
      params_fitted_orientation = rbind(params_fitted_orientation, solution)
    
      print(res)
    }
  }
}
## [1] "Fitting orientation for CDR1 TRA MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 476 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000281677583924834 
## Optimal value of controls: 25.83908 0 0.566778
## 
## 
## [1] "Fitting orientation for CDR1 TRA MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 652 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.00029200882916876 
## Optimal value of controls: 28.22229 0 0.5941197
## 
## 
## [1] "Fitting orientation for CDR1 TRB MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 338 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.00030045425600131 
## Optimal value of controls: 28.6697 3.189475 -0.5691854
## 
## 
## [1] "Fitting orientation for CDR1 TRB MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 418 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000306608745791826 
## Optimal value of controls: 30.7937 2.358328 -0.5602491
## 
## 
## [1] "Fitting orientation for CDR2 TRA MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 359 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000160408768642331 
## Optimal value of controls: 26.13596 2.009212 0.3564626
## 
## 
## [1] "Fitting orientation for CDR2 TRA MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 601 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  7.55625440907815e-05 
## Optimal value of controls: 23.59515 4.34877 0.2376385
## 
## 
## [1] "Fitting orientation for CDR2 TRB MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1793 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000269644602436689 
## Optimal value of controls: 24.00763 1.306104 -0.4031764
## 
## 
## [1] "Fitting orientation for CDR2 TRB MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 208 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000194616634150638 
## Optimal value of controls: 25.94113 2.020967 -0.389075
## 
## 
## [1] "Fitting orientation for CDR3 TRA MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1457 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000381322969460991 
## Optimal value of controls: 27.64579 4.302396 0.3061143
## 
## 
## [1] "Fitting orientation for CDR3 TRA MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1674 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000369182433703871 
## Optimal value of controls: 32.49867 4.851156 0.4686846
## 
## 
## [1] "Fitting orientation for CDR3 TRB MHCI"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1154 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000440011128484609 
## Optimal value of controls: 29.07562 0.8822835 -0.3010867
## 
## 
## [1] "Fitting orientation for CDR3 TRB MHCII"
## 
## Call:
## nloptr(x0 = c(20, pi/2, 0), eval_f = obj, lb = c(0, 0, -pi/2), 
##     ub = c(100, 2 * pi, pi/2), opts = list(algorithm = "NLOPT_LN_SBPLX", 
##         maxeval = 1e+05), data = df.fit)
## 
## 
## Minimization using NLopt version 2.4.2 
## 
## NLopt solver status: 4 ( NLOPT_XTOL_REACHED: Optimization stopped because 
## xtol_rel or xtol_abs (above) was reached. )
## 
## Number of Iterations....: 1072 
## Termination conditions:  maxeval: 1e+05 
## Number of inequality constraints:  0 
## Number of equality constraints:    0 
## Optimal value of objective function:  0.000237298581665191 
## Optimal value of controls: 29.8334 1.793355 -0.3507075

Take a look at fitted values

params_fitted_orientation1 = dcast(params_fitted_orientation, mhc_type + tcr_chain + tcr_region ~ param_name, value.var = "params2")
params_fitted_orientation2 = params_fitted_orientation1
params_fitted_orientation2$yaw = params_fitted_orientation2$yaw + 180 
params_fitted_orientation2$pitch = NA 
params_fitted_orientation1 = rbind(params_fitted_orientation1, params_fitted_orientation2)

params_fitted_orientation1$yaw = ifelse(params_fitted_orientation1$yaw > 360, 
                                        params_fitted_orientation1$yaw - 360,
                                        ifelse(params_fitted_orientation1$yaw < 0,
                                               360 + params_fitted_orientation1$yaw,
                                                            params_fitted_orientation1$yaw))
params_fitted_orientation1$pitch = ifelse(params_fitted_orientation1$pitch > 360, 
                                        params_fitted_orientation1$pitch - 360,
                                        ifelse(params_fitted_orientation1$pitch < 0,
                                               360 + params_fitted_orientation1$pitch,
                                                            params_fitted_orientation1$pitch))

ggplot(params_fitted_orientation1, aes(x = yaw, y = 1, color = tcr_region)) +
  geom_bar(stat="identity") +
  geom_point() +
  coord_polar(theta = "x") +
  scale_x_continuous("yaw (rotation wrt Z axis)", breaks=seq(0, 360, by=30), expand=c(0,0), lim=c(0, 360)) +
  facet_grid(mhc_type ~ tcr_chain)

ggplot(params_fitted_orientation1, aes(x = pitch, y = height, color = tcr_region)) +
  geom_bar(stat="identity") +
  geom_point() +
  coord_polar(theta = "x") +
  scale_x_continuous("pitch (rotation wrt Y axis)", breaks=seq(0, 360, by=30), expand=c(0,0), lim=c(0, 360)) +
  facet_grid(mhc_type ~ tcr_chain)
## Warning: Removed 12 rows containing missing values (position_stack).
## Warning: Removed 12 rows containing missing values (geom_point).

General purpose function for computing distances

compute_distance_slave = function(pdb_id, pos_tcr, len_tcr, pos_antigen, len_antigen, tcr_chain, tcr_region, mhc_type) {
  tc = as.character(tcr_chain[1])
  tr = as.character(tcr_region[1])
  mt = as.character(mhc_type[1])
  
  x_tcr = calc_coord_tcr_fitted(pos_tcr, len_tcr, tr, "x")
  y_tcr = calc_coord_tcr_fitted(pos_tcr, len_tcr, tr, "y")
  z_tcr = calc_coord_tcr_fitted(pos_tcr, len_tcr, tr, "z")
      
  x_ag = calc_coord_ag_fitted(pos_antigen, len_antigen, mt, "x")
  y_ag = calc_coord_ag_fitted(pos_antigen, len_antigen, mt, "y")
  z_ag = calc_coord_ag_fitted(pos_antigen, len_antigen, mt, "z")
      
  max_x = data.frame(pdb_id, x_tcr, x_ag) %>%
    group_by(pdb_id) %>%
    mutate(x_tcr_max = max(x_tcr), x_ag_max = max(x_ag))
  
  params = (params_fitted_orientation %>%
    filter(tcr_chain == tc & tcr_region == tr & mhc_type == mt))$params
      
  calc_dist(x_tcr, y_tcr, z_tcr, x_ag, y_ag, z_ag, max_x$x_tcr_max, max_x$x_ag_max, params)
}

compute_distances_master = function(data) {
  data %>% 
    group_by(tcr_chain, tcr_region, mhc_type) %>%
    mutate(distance_est = compute_distance_slave(pdb_id, pos_tcr, len_tcr, pos_antigen, len_antigen, tcr_chain, tcr_region, mhc_type))
}

df.distest = compute_distances_master(df.distest)

Now lets take a look at overall results

ggplot(df.distest, aes(x = as.integer(distance_CA), group = as.integer(distance_CA), y = distance_est)) +
  geom_boxplot() +
  facet_grid(mhc_type~tcr_chain+tcr_region) +
  geom_abline(slope = 1, intercept = 0, color="red", linetype="dashed") +
  scale_x_continuous(limits = c(0, 30)) +
  scale_y_continuous(limits = c(0, 30)) +
  theme_bw()
## Warning: Removed 4477 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

Contact probabilities

We will use the data frame with estimated distances from previous section. Contacts are defined as amino acid pairs that have interaction energy less than \(0\). A simple naive bayes network is used to fit the values.

E_NOISE_THRESHOLD = 0

df.bn.orig = df.distest %>%
  mutate(contact = energy < E_NOISE_THRESHOLD, contact_f = as.factor(contact), 
         energy = ifelse(energy > 0, 0, energy),
         aa_pair = as.factor(ifelse(as.character(aa_tcr) < as.character(aa_antigen), 
                                    paste(aa_tcr, aa_antigen, sep = "_"), paste(aa_antigen, aa_tcr, sep = "_")))) 

df.bn = df.bn.orig %>%
  filter(!is.na(contact)) %>%
  droplevels

df.bn.fc = df.bn %>%
  group_by(pdb_id) %>%
  summarize(few_contacts = sum(contact) < 5) %>%
  filter(few_contacts)

df.bn = df.bn %>%
  filter(!(pdb_id %in% df.bn.fc$pdb_id))

Plot distance distribution for contacts and non-contacts:

ggplot(df.bn, aes(x = distance_CA, fill = contact)) +
  geom_histogram(binwidth = 1) +
  facet_grid(mhc_type~tcr_chain+tcr_region, scales="free_y") +
  theme_bw()

Focus on TCR amino acid

ggplot(df.bn, aes(x = distance_est, fill = contact)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~aa_tcr, scales="free_y") +
  theme_bw()

Focus on TCR amino acid, real energy values, color by antigen amino acid

ggplot(df.bn, aes(x = distance_est, y = energy, fill = aa_antigen)) +
  geom_point(shape=21) +
  facet_wrap(~aa_tcr) +
  theme_bw() + 
  theme(legend.position="bottom") + guides(fill=guide_legend(nrow=2, byrow=T))

ggplot(df.bn %>% filter(energy < 0 & aa_tcr == "R"), aes(x = distance_est, y = energy)) +
  geom_point(shape=21) +
  facet_wrap(~aa_antigen) +
  theme_bw()

Bayes model

Fitting the model

Distance discretization, all residues with \(d > 21\) automatically get \(0\) contact probability. Distances from \(5..21\) are put into \(2\) angstrom bins

ggplot(subset(df.bn, distance_est <= 21), aes(x = pmax(5, distance_est), fill = contact)) +
  geom_histogram(binwidth = 2) +
  facet_grid(contact~., scales = "free_y")

Prepare dataset once more

df.bn.fit = df.bn %>%
  filter(distance_est <= 21)

df.bn.fit = as.data.frame(df.bn.fit)

df.bn.fit$distance_est_f = as.factor(as.integer(pmax(0, df.bn.fit$distance_est - 5) / 2) + 1)
df.bn.fit$odd_tcr = as.factor(df.bn.fit$pos_tcr %% 2 == 1)
df.bn.fit$odd_antigen = as.factor(df.bn.fit$pos_antigen %% 2 == 1)

levels(df.bn.fit$distance_est_f)
## [1] "1" "2" "3" "4" "5" "6" "7" "8"

Our very simple model

emp_net = model2network(paste(
  "[contact_f]",
  "[aa_pair|contact_f]",
  "[odd_tcr|contact_f]",
  "[odd_antigen|contact_f]",
  "[distance_est_f|odd_tcr:odd_antigen:contact_f]",
  sep =""))

graphviz.plot(emp_net)
## Loading required namespace: Rgraphviz

Fit the model

df.bn.fit1 = df.bn.fit %>% dplyr::select(aa_pair, 
                                         distance_est_f,
                                         odd_tcr,
                                         odd_antigen,
                                         contact_f)

fit = bn.fit(emp_net,
             df.bn.fit1, method="bayes")

BIC(fit, df.bn.fit1)
## [1] -322586.5

Exploring performance

Compute the probabilities and plot the ROC curve

res = predict(fit, node="contact_f", method="bayes-lw", 
              data=df.bn.fit1, prob=T)

p = attributes(res)$prob

rocobj = plot.roc(df.bn.fit1[,"contact_f"], p[2,], ci=T)

rocobj
## 
## Call:
## plot.roc.default(x = df.bn.fit1[, "contact_f"], predictor = p[2,     ], ci = T)
## 
## Data: p[2, ] in 32111 controls (df.bn.fit1[, "contact_f"] FALSE) < 5170 cases (df.bn.fit1[, "contact_f"] TRUE).
## Area under the curve: 0.861
## 95% CI: 0.856-0.866 (DeLong)

By region and chain

df.bn.fit$p = p[2,]

df.bn.fit.sum = df.bn.fit %>%
  group_by(pdb_id, tcr_chain, mhc_type, tcr_region) %>%
  summarize(contacts = sum(contact), contacts_est = sum(p))

ggplot(df.bn.fit.sum, aes(x = contacts_est, y = contacts,color=tcr_chain,shape =mhc_type)) +
  geom_point() + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  facet_wrap(~tcr_region,scales="free")

summary(lm(contacts~contacts_est, subset(df.bn.fit.sum, tcr_region == "CDR3")))
## 
## Call:
## lm(formula = contacts ~ contacts_est, data = subset(df.bn.fit.sum, 
##     tcr_region == "CDR3"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.6066  -3.4295   0.4862   4.4030  13.2580 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.38980    1.17112   7.164 1.14e-11 ***
## contacts_est  0.49607    0.06758   7.340 3.99e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.49 on 222 degrees of freedom
## Multiple R-squared:  0.1953, Adjusted R-squared:  0.1917 
## F-statistic: 53.88 on 1 and 222 DF,  p-value: 3.989e-12

By chain

df.bn.fit.sum1 = df.bn.fit %>%
  group_by(pdb_id, tcr_chain, mhc_type) %>%
  summarize(contacts = sum(contact), contacts_est = sum(p))

ggplot(df.bn.fit.sum1, aes(x = contacts_est, y = contacts, color=tcr_chain, shape =mhc_type)) +
  geom_point() + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed")

summary(lm(contacts~contacts_est, df.bn.fit.sum1))
## 
## Call:
## lm(formula = contacts ~ contacts_est, data = df.bn.fit.sum1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.9734  -3.6007   0.2963   5.0598  18.0041 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   15.2019     1.5438   9.847  < 2e-16 ***
## contacts_est   0.3387     0.0627   5.402 1.69e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.576 on 222 degrees of freedom
## Multiple R-squared:  0.1162, Adjusted R-squared:  0.1122 
## F-statistic: 29.18 on 1 and 222 DF,  p-value: 1.693e-07

Overall

df.bn.fit.sum2 = df.bn.fit %>%
  group_by(pdb_id, mhc_type) %>%
  summarize(contacts = sum(contact), contacts_est = sum(p))

ggplot(df.bn.fit.sum2, aes(x = contacts_est, y = contacts, shape =mhc_type)) +
  geom_point() + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed")

summary(lm(contacts~contacts_est, df.bn.fit.sum2))
## 
## Call:
## lm(formula = contacts ~ contacts_est, data = df.bn.fit.sum2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.344  -6.248   1.076   8.035  23.136 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.98989    3.58490   8.923 1.14e-14 ***
## contacts_est  0.30460    0.07385   4.125 7.24e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.83 on 110 degrees of freedom
## Multiple R-squared:  0.1339, Adjusted R-squared:  0.1261 
## F-statistic: 17.01 on 1 and 110 DF,  p-value: 7.237e-05

Exploring fitted model

TODO

Contact energies

Compute mean energies (for contacts only)

df.e = df.bn %>% 
  filter(contact) %>%
  group_by(aa_pair) %>%
  summarize(E = mean(energy))

df.e$aa_tcr = str_split_fixed(df.e$aa_pair, "_", 2)[, 1]
df.e$aa_antigen = str_split_fixed(df.e$aa_pair, "_", 2)[ ,2]

df.e2 = df.e
df.e2$aa_tcr = df.e$aa_antigen
df.e2$aa_antigen = df.e$aa_tcr

df.e = rbind(df.e, df.e2)
df.e2 = NULL

ggplot(df.e, aes(x=aa_tcr, y = aa_antigen, fill=E)) +
  geom_tile() + 
  scale_fill_gradientn(colors=colorRampPalette(rev(brewer.pal(9, 'YlOrRd')))(32)) +
  theme_bw()

Merge with BN predictions

df.final = merge(df.bn.fit, df.e, all.y = T)
df.final$E[is.na(df.final$E)] = 0

df.final = df.final %>%
  mutate(pE = p * E)

Summarize energies

df.final.sum = df.final %>%
  group_by(pdb_id, tcr_chain, mhc_type, tcr_region) %>%
  summarize(E = sum(energy), E_est = sum(pE))

ggplot(df.final.sum, aes(x = E_est, y = E, color=tcr_chain,shape =mhc_type)) +
  geom_point() + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  facet_wrap(~tcr_region,scales="free")

summary(lm(E~E_est, subset(df.final.sum, tcr_region == "CDR3")))
## 
## Call:
## lm(formula = E ~ E_est, data = subset(df.final.sum, tcr_region == 
##     "CDR3"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -141.22  -24.51    3.36   28.00   90.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -32.63521    5.93679  -5.497 1.06e-07 ***
## E_est         0.52684    0.08377   6.289 1.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.96 on 222 degrees of freedom
## Multiple R-squared:  0.1512, Adjusted R-squared:  0.1474 
## F-statistic: 39.55 on 1 and 222 DF,  p-value: 1.675e-09

Example complex

df.final.GLC = df.final %>%
  filter(antigen_seq == "GLCTLVAML") %>%
  droplevels()

ggplot(df.final.GLC, aes(x=pos_tcr, y=pos_antigen)) +
  geom_tile(fill=NA) +
  geom_label(aes(label=paste(aa_tcr, aa_antigen, sep=":"), fill = p), cex=1.3) +
  geom_point(aes(color=contact)) +
  scale_x_continuous(breaks=0:20) +
  scale_y_continuous(breaks=0:20) +
  scale_fill_gradient("P", 
                      low="white", high="#045a8d") +
  scale_color_manual(values = c(NA, "red")) +
  facet_grid(tcr_chain ~ tcr_region, scales="free", space="free") +
  theme_bw()
## Warning: Removed 241 rows containing missing values (geom_point).

ggplot(df.final.GLC, aes(x=pos_tcr, y=pos_antigen)) +
  geom_tile(fill=NA) +
  geom_label(aes(label=paste(aa_tcr, aa_antigen, sep=":"), fill = pE), cex=1.3) +
  geom_point(aes(color=contact, size = -energy), shape = 21) +
  scale_x_continuous(breaks=0:20) +
  scale_y_continuous(breaks=0:20) +
  scale_fill_gradient("E", 
                      low="#045a8d", high="white") +
  scale_color_manual(values = c(NA, "red")) +
  facet_grid(tcr_chain ~ tcr_region, scales="free", space="free") +
  theme_bw()
## Warning: Removed 241 rows containing missing values (geom_point).